#include <iostream>
#include <cstdlib>
#include <cstring>
#include <stdlib.h> 
#include <cuda_runtime.h>

// Sorts arr[0..size) into ascending order, in place, using insertion sort.
//
// Insertion sort is sequential: inserting element i relies on arr[0..i)
// already being sorted. The whole sort is therefore carried out by one
// thread (the first thread of the first block); every other launched thread
// returns immediately. Letting several threads run the shifting loop over the
// same array at once is a data race that can duplicate or drop values.
//
// Launch: any grid/block shape is accepted; <<<1, 1>>> is sufficient.
// No shared memory is used.
// arr must point to device-accessible memory holding at least size ints.
// A size of zero or less leaves the array untouched.
__global__ void Sort(int* arr, int size) {
	const bool isFirstThread =
		(blockIdx.x | blockIdx.y | blockIdx.z |
		 threadIdx.x | threadIdx.y | threadIdx.z) == 0;
	if (!isFirstThread) {
		return;
	}

	// A single element is trivially sorted, so insertion starts at index 1.
	for (int i = 1; i < size; i++) {
		int curr = arr[i];
		int j = i;
		// Shift every larger element of the sorted prefix one slot to the
		// right, stopping at the position where curr belongs.
		for (; j > 0 && arr[j - 1] > curr; j--) {
			arr[j] = arr[j - 1];
		}
		arr[j] = curr;
	}
}

// Overwrites arr[0..size) with pseudo-random values taken from rand(),
// each reduced modulo size so that it lies in [0, size).
// Nothing is written when size is zero or negative.
// The generator is not seeded here.
void fillArray(int* arr, int size) {
	int slot = 0;
	while (slot < size) {
		const int value = rand() % size;
		arr[slot] = value;
		++slot;
	}
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what) {
	if (status == cudaSuccess) {
		return true;
	}
	std::cerr << "CUDA error during " << what << ": "
	          << cudaGetErrorString(status) << std::endl;
	return false;
}

// Performs insertion sort on the host array arr[0..size) by copying it to the
// device, running the Sort kernel on it, and copying the result back.
//
// arr  - host array to sort in place (ascending order).
// size - number of elements in arr; fewer than two elements is a no-op.
//
// If any CUDA call fails, a message is written to stderr and the function
// returns. arr is only written by the final device-to-host copy, which is
// attempted only after every earlier step succeeded. The device buffer is
// released on every path after a successful allocation.
//
// The device is intentionally not reset here: cudaDeviceReset() would destroy
// every device resource of the process, including ones owned by the caller.
void insertionSort(int *arr, int size) {
	// Nothing to sort; this also keeps a zero or negative size away from
	// the byte-count computation and the allocation below.
	if (arr == nullptr || size < 2) {
		return;
	}

	const size_t bytes = static_cast<size_t>(size) * sizeof(int);

	int* d_a = nullptr;
	if (!cudaSucceeded(cudaMalloc((void**)&d_a, bytes), "device allocation")) {
		return;
	}

	bool ok = cudaSucceeded(
		cudaMemcpy(d_a, arr, bytes, cudaMemcpyHostToDevice),
		"host-to-device copy");

	if (ok) {
		// Insertion sort is sequential, so exactly one thread is launched.
		// Launching `size` threads would be an invalid configuration once
		// size exceeds the per-block thread limit, and would have several
		// threads rewriting the same array at the same time.
		Sort<<<1, 1>>>(d_a, size);

		// A launch returns no status: configuration errors are reported by
		// cudaGetLastError(), faults during execution by the synchronize.
		ok = cudaSucceeded(cudaGetLastError(), "kernel launch")
			&& cudaSucceeded(cudaDeviceSynchronize(), "kernel execution");
	}

	if (ok) {
		// copy from device to host memory
		cudaSucceeded(
			cudaMemcpy(arr, d_a, bytes, cudaMemcpyDeviceToHost),
			"device-to-host copy");
	}

	cudaSucceeded(cudaFree(d_a), "device free");
}

// Writes the first size elements of arr to standard output on a single line,
// each value followed by one space, then terminates the line with std::endl
// (which also flushes the stream). A size of zero or less prints only the
// line terminator.
void print(int *arr, int size) {
	int idx = 0;
	while (idx < size) {
		const int value = arr[idx];
		std::cout << value << " ";
		++idx;
	}
	std::cout << std::endl;
}

// Program entry point.
//
// Usage: <program> <array-size>
//
// Fills an array of <array-size> ints with pseudo-random values, sorts it with
// insertionSort, and prints the result. Returns EXIT_FAILURE without sorting
// when the size argument is missing, is not a plain base-10 integer, is
// negative, or does not fit in an int.
int main(int argc, char *argv[]) {

	// argv[1] is only valid when an argument was actually supplied.
	if (argc < 2) {
		std::cerr << "Usage: " << (argc > 0 ? argv[0] : "sort")
		          << " <array-size>" << std::endl;
		return EXIT_FAILURE;
	}

	//Get the size of the array
	// strtol (unlike atoi) exposes where parsing stopped, so empty input and
	// trailing garbage can be rejected instead of silently becoming 0.
	char *parseEnd = nullptr;
	const long requested = std::strtol(argv[1], &parseEnd, 10);
	const int n = static_cast<int>(requested);
	const bool parsedWholeArgument = parseEnd != argv[1] && *parseEnd == '\0';
	// The round trip through int detects values too large for the array size.
	const bool fitsInInt = static_cast<long>(n) == requested;
	if (!parsedWholeArgument || requested < 0 || !fitsInInt) {
		std::cerr << "Array size must be a non-negative integer, got \""
		          << argv[1] << "\"" << std::endl;
		return EXIT_FAILURE;
	}

	// Allocate the array of size n that will be sorted
	int *insertionArray = new int[n];

	//Fill the array with randomly generated numbers
	fillArray(insertionArray, n);

	//Sort the array and print the result
	insertionSort(insertionArray, n);
	std::cout << "Insertion Sort performed." << std::endl;
	print(insertionArray, n);


	//Deallocate the array
	delete[] insertionArray;

	return 0;
}